{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-7f558e8f3d6ac7e9",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "# Exercise 05) Temporal-Difference Learning\n",
    "\n",
    "In this exercise we again use the racetrack environment developed in the first homework.\n",
    "You can either use our provided environment or your own solution from the homework assignment.  \n",
    "\n",
    "For starting, please execute the following cells. \n",
    "There, we will build the more complex rectangular course which was used in the last task of exercise 04.\n",
    "\n",
    "A dummy policy is defined, which turns the car to the right in front of a wall.\n",
    "As a reminder the action encoding can be seen in the following picture:\n",
    "\n",
    "![](Directions_Legend.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-6cd0709c047c84bf",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import random\n",
    "from racetrack_environment import RaceTrackEnv\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm.notebook import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import build_rect_course"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the course\n",
    "_course_dim = (8, 10)\n",
    "_inner_wall_dim = (2, 6)\n",
    "\n",
    "\n",
    "course = build_rect_course(_course_dim, _inner_wall_dim)\n",
    "track = RaceTrackEnv(course)\n",
    "dummy_slow_pi = np.ones([track.bounds[0], track.bounds[1], 1+2*track.MAX_VELOCITY, 1+2*track.MAX_VELOCITY]) * 4\n",
    "\n",
    "y_size, x_size = track.bounds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# setup dummy policy\n",
    "\n",
    "dummy_slow_pi[:track.bounds[0]//2, :, 0 , 0] = 5 # turn right\n",
    "dummy_slow_pi[:track.bounds[0]//2, -2:, 0 , :] = 6 # turn down left\n",
    "dummy_slow_pi[-2:, track.bounds[1]//2:, : , 0] = 0 # turn up left\n",
    "dummy_slow_pi[track.bounds[0]//2:, :2, 0, :] = 2 # turn up right\n",
    "dummy_slow_pi[:2, 0:track.bounds[1]//2, :, 0] = 8 # turn down right\n",
    "\n",
    "pi = dummy_slow_pi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-4c2c49205b405e2e",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "# Run learned policy on test case\n",
    "pos_map = np.zeros((y_size, x_size))\n",
    "state = track.reset()\n",
    "for k in range(2000):\n",
    "    pos_map[state[0], state[1]] += 1  # exploration map\n",
    "    act = track.action_to_tuple(pi[state])\n",
    "    state, reward, terminated, truncated, _ = track.step(act)\n",
    "    if truncated: state = track.reset()\n",
    "    if terminated: break    \n",
    "\n",
    "for row in course:\n",
    "    print(row)\n",
    "\n",
    "print('\\n \\n Sample trajectory on dummy policy:')\n",
    "pos_map = (pos_map > 0).astype(np.float32)\n",
    "pos_map +=  track.course  # overlay track course\n",
    "plt.imshow(pos_map, cmap='hot', interpolation='nearest')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-ca8dcc7e0dc05f9d",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "## 1) TD-Based Prediction (Policy Evaluation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-7197a6aac11887b4",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "Write a TD-based prediction algorithm to evaluate the dummy policy using $\\alpha = 0.2$ and $\\gamma = 1$ and calculate the state values.\n",
    "\n",
    "After how many episodes do the state values converge?\n",
    "Compare this to Monte-Carlo first visit prediciton from exercise 04.\n",
    "\n",
    "Change $\\alpha$ to $1$? Does it work? Explain!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-9f2519bdb5105516",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "## 1) Solution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def text_print_pos_map(_pos_map):\n",
    "    \"\"\"Function to print the state values\"\"\"\n",
    "    for row in _pos_map:\n",
    "        print(' '.join(x_size*['{}']).format(*[str(int(r)).zfill(3) for r in row]))\n",
    "\n",
    "def plot_pos_map(_pos_map):\n",
    "    \"\"\"# Function to plot the heatmap\"\"\"\n",
    "    plt.imshow(_pos_map, cmap='hot', interpolation='nearest')\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "def interact(pi, state):\n",
    "    \"\"\"Interact with the environment to get to the next state.\n",
    "\n",
    "    Args:\n",
    "        pi: The policy to follow\n",
    "        state: The current state before interaction\n",
    "\n",
    "    Returns:\n",
    "        next_state: The next state after interaction\n",
    "        reward: The reward for the current interaction\n",
    "        terminated: If the goal was reached\n",
    "        truncated: If the boundary of the track was breached\n",
    "    \"\"\"\n",
    "    # YOUR CODE HERE\n",
    "    raise NotImplementedError()\n",
    "    return next_state, reward, terminated, truncated\n",
    "\n",
    "def learn(values, state, next_state, reward, gamma, alpha):\n",
    "    \"\"\"Learn from the collected data using TD0-based prediction.\n",
    "    \n",
    "    Args:\n",
    "        values: The state-value estimates before the current update\n",
    "        state: The state before the last interaction\n",
    "        next_state: The state after the last interaction\n",
    "        reward: The reward for the last interaction\n",
    "        gamma: Discount factor\n",
    "        alpha: Forgetting factor\n",
    "    \"\"\"\n",
    "    # YOUR CODE HERE\n",
    "    raise NotImplementedError()\n",
    "\n",
    "    return values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-9420a2eab6df4bfe",
     "locked": false,
     "schema_version": 3,
     "solution": true,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "# Initialise state values \n",
    "values = np.zeros([track.bounds[0], track.bounds[1], 1+2*track.MAX_VELOCITY, 1+2*track.MAX_VELOCITY])\n",
    "\n",
    "# Configuration parameters\n",
    "gamma = 1\n",
    "alpha = 1\n",
    "\n",
    "# Initialise race track course\n",
    "course = course\n",
    "track = RaceTrackEnv(course)\n",
    "x_size, y_size = len(course[0]), len(course)\n",
    "\n",
    "pos_map = np.zeros((y_size, x_size))\n",
    "\n",
    "episodes = 250\n",
    "\n",
    "for e in tqdm(range(episodes), desc='episode'): \n",
    "        \n",
    "    # initialize x0\n",
    "    state = track.reset()\n",
    "  \n",
    "    # episodes do not terminate by time limit\n",
    "    while True:\n",
    "        next_state, reward, terminated, truncated = interact(pi, state)\n",
    "\n",
    "        if truncated: \n",
    "            next_state = track.reset()\n",
    "\n",
    "        values = learn(values, state, next_state, reward, gamma, alpha)        \n",
    "        \n",
    "        # x_k = x_k+1\n",
    "        state = next_state\n",
    "\n",
    "        if terminated:\n",
    "            break\n",
    "\n",
    "    # update map\n",
    "    for s_x in range(x_size):\n",
    "        for s_y in range(y_size):\n",
    "            pos_map[s_y, s_x] = np.min(values[s_y, s_x, :, :])\n",
    "     \n",
    "text_print_pos_map(pos_map)\n",
    "  \n",
    "# Plot heatmap in the end\n",
    "plot_pos_map(-pos_map)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-6bef3d58f89a83b6",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "## 2) On-Policy $\\varepsilon$-Greedy Control: "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-0ad52e3055e4119c",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "Write a temporal-difference based $\\varepsilon$-greedy control algorithm to solve the rectangular course environment used above. \n",
    "Use $\\varepsilon = 0.1$, $\\alpha = 0.5$ and $\\gamma = 1$ and run $500$ episodes.\n",
    "\n",
    "Note that no initial policy is needed for TD control methods. \n",
    "\n",
    "\n",
    "Does Sarsa perform good at learning an optimal greedy policy?\n",
    "\n",
    "Change $\\alpha$ to $0.1$ and $0.9$. What do you recognize? Explain!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-4b7cde2b63748115",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "## 2) Solution"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": true,
     "grade_id": "cell-88c20c864f47389e",
     "locked": false,
     "points": 0,
     "schema_version": 3,
     "solution": true,
     "task": false
    }
   },
   "source": [
    "Algorithm given below.\n",
    "\n",
    "As a soft exploration policy $\\pi$ is used for training, the $q_\\pi$ that is derived from that will be biased due to $\\pi$ being soft and not optimally greedy.\n",
    "\n",
    "Changing $\\alpha$ to 0.1 will result in a less biased policy. \n",
    "But the training will need more time because the action value updates are smaller due to the lower learning rate $\\alpha$:\n",
    "\n",
    "$q(x_\\mathrm{k}, u_\\mathrm{k}) = q(x_\\mathrm{k}, u_\\mathrm{k}) + \\alpha [r_\\mathrm{k+1} + \\gamma q(x_\\mathrm{k+1}, u_\\mathrm{k+1}) - q(x_\\mathrm{k}, u_\\mathrm{k}) ] $"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def policy(action_values, state, deterministic, epsilon):\n",
    "    \"\"\"Decides on one of the actions in dependence of the current state.\n",
    "\n",
    "    Args:\n",
    "        action_values: The current action values\n",
    "        state: The state vector\n",
    "        deterministic: Whether actions are chosen deterministically or eps-greedily\n",
    "        epsilon: Probability for random action in eps-greedy\n",
    "\n",
    "    Returns:\n",
    "        action: The chosen action\n",
    "    \"\"\"\n",
    "    # YOUR CODE HERE\n",
    "    raise NotImplementedError()\n",
    "    return action\n",
    "\n",
    "\n",
    "def learn(action_values, state_action, next_state_action, reward, gamma, alpha):\n",
    "    \"\"\"Learn from the collected data using TD0-based prediction.\n",
    "    \n",
    "    Args:\n",
    "        action_values: The action-value estimates before the current update\n",
    "        state_action: The state+action before the last interaction\n",
    "        next_state_action: The state+action for the next interaction\n",
    "        reward: The reward for the last interaction\n",
    "        gamma: Discount factor\n",
    "        alpha: Forgetting factor\n",
    "    \"\"\"\n",
    "    # YOUR CODE HERE\n",
    "    raise NotImplementedError()\n",
    "    return action_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-44607222bc50aac7",
     "locked": false,
     "schema_version": 3,
     "solution": true,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "# Initialise action values \n",
    "action_values = np.zeros([track.bounds[0], track.bounds[1], 1+2*track.MAX_VELOCITY, 1+2*track.MAX_VELOCITY, 3, 3])\n",
    "\n",
    "cumulated_rewards = []\n",
    "q_sample = []\n",
    "\n",
    "# configuration parameters\n",
    "epsilon = 0.1   # exploration probability\n",
    "gamma = 1       # discount factor\n",
    "alpha = 0.5     # forgetting factor\n",
    "n_episodes = 500 # number of evaluated episodes\n",
    "\n",
    "# define track\n",
    "course = course\n",
    "track = RaceTrackEnv(course)\n",
    "x_size, y_size = len(course[0]), len(course)\n",
    "\n",
    "\n",
    "for e in tqdm(range(n_episodes), desc='episode'): \n",
    "    cumulated_reward = 0\n",
    "    \n",
    "    pos_map = np.zeros((y_size, x_size))\n",
    "    \n",
    "    # YOUR CODE HERE\n",
    "    raise NotImplementedError()\n",
    "\n",
    "    # get first action\n",
    "\n",
    "    # episodes do not terminate by time limit\n",
    "    while True:\n",
    "        pos_map[state[0], state[1]] += 1  # exploration map\n",
    "\n",
    "        # take step in track\n",
    "\n",
    "        # get action for the next step\n",
    "\n",
    "        #learn\n",
    "        \n",
    "        # save variables for the next iteration\n",
    "\n",
    "        cumulated_reward += reward\n",
    "        \n",
    "        # end episode on termination\n",
    "    \n",
    "    cumulated_rewards.append(cumulated_reward)      \n",
    "        \n",
    "    \n",
    "plt.plot(cumulated_rewards)\n",
    "plt.title('Training Progress using alpha = {}:'.format(alpha))\n",
    "plt.xlabel('Episode')\n",
    "plt.ylabel('Cumulated Reward')\n",
    "plt.ylim(-2000, 0) \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-c0313d5e344bffb6",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "Use the following code to apply the best policy. Due to the random start position, do $5$ iterations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-1f62b782c8ee360f",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def evaluate_policy(action_values, n_episodes=5):\n",
    "    for e in range(n_episodes):\n",
    "        state_actions = []\n",
    "        rewards = []\n",
    "\n",
    "        pos_map = np.zeros((y_size, x_size))\n",
    "        state = track.reset()\n",
    "        for k in range(200):\n",
    "\n",
    "            pos_map[state[0], state[1]] += 1  # exploration map\n",
    "\n",
    "            action = policy(action_values, state, deterministic=True, epsilon=epsilon)\n",
    "            action = track.action_to_tuple(action)\n",
    "            \n",
    "            state_action = track.state_action(state, action)\n",
    "            state_actions.append(state_action)\n",
    "\n",
    "            state, reward, terminated, truncated, _ = track.step(action)\n",
    "            \n",
    "            rewards.append(reward)\n",
    "\n",
    "            if truncated:\n",
    "                state = track.reset()\n",
    "\n",
    "            if terminated:\n",
    "                print('Done')\n",
    "                break \n",
    "\n",
    "        print('Sample trajectory on learned policy in episode {}:'.format(e))\n",
    "        pos_map = (pos_map > 0).astype(np.int16)\n",
    "        pos_map +=  track.course  # overlay track course\n",
    "        plt.imshow(pos_map, cmap='hot', interpolation='nearest')\n",
    "        plt.show()\n",
    "\n",
    "\n",
    "evaluate_policy(action_values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-0c371bcfacdd888b",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "## 3) Off-Policy $\\varepsilon$-Greedy Control: Q-Learning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-f3c161cad2caf81b",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "Write a function for a Q-Learning algorithm to solve the rectangular course environment (function will be re-used in the next task).\n",
    "Use again $\\varepsilon = 0.1$, $\\alpha = 0.5$, $\\gamma = 1$ and 500 episodes.\n",
    "\n",
    "Can the resulting greedy policy be expected to be better or worse than the optimal policy trained with Sarsa?\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-4d7c8d85aab5d729",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "## 3) Solution"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": true,
     "grade_id": "cell-87575006926b61cc",
     "locked": false,
     "points": 0,
     "schema_version": 3,
     "solution": true,
     "task": false
    }
   },
   "source": [
    "Algorithm given below.\n",
    "\n",
    "Q-learning determines its action values based on the idea that an optimal greedy policy is acquired in the end (hence the $\\text{max} \\, q$ operation when updating the action values). Thus, the resulting policy is not biased towards softness of the exploration policy, but towards assumption of optimality (maximization bias).\n",
    "In this example the reward is deterministic, so maximization bias is not a problem and $\\alpha$ only influences the speed of learning. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": true,
     "grade_id": "cell-155e68f8d30832aa",
     "locked": false,
     "points": 0,
     "schema_version": 3,
     "solution": true,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "def interact(action_values, state, epsilon):\n",
    "    \"\"\"Interact with the environment to get to the next state.\n",
    "    You may reuse the policy function from task 2) within this\n",
    "    function.\n",
    "\n",
    "    Args:\n",
    "        action_values: The action-value estimates\n",
    "        state: The current state before interaction\n",
    "        epsilon: The probability for a random action\n",
    "\n",
    "    Returns:\n",
    "        next_state: The next state after interaction\n",
    "        reward: The reward for the current interaction\n",
    "        terminated: If the goal was reached\n",
    "        truncated: If the boundary of the track was breached\n",
    "        action: The applied action\n",
    "    \"\"\"\n",
    "    # YOUR CODE HERE\n",
    "    raise NotImplementedError()\n",
    "    return next_state, reward, terminated, truncated, action\n",
    "\n",
    "\n",
    "def learn(action_values, state, action, next_state, reward, gamma, alpha):\n",
    "    \"\"\"Learn from the collected data with a q-learning update step.\n",
    "    \n",
    "    Args:\n",
    "        action_values: The action-value estimates before the update step\n",
    "        state: The state before the interaction\n",
    "        action: The chosen action\n",
    "        next_state: The next state after the interaction\n",
    "        reward: The reward for the interaction\n",
    "        gamma: Discount factor\n",
    "        alpha: Step size\n",
    "\n",
    "    Returns:\n",
    "        action_values: The updated action-value estimates\n",
    "    \"\"\"\n",
    "    # YOUR CODE HERE\n",
    "    raise NotImplementedError()\n",
    "    return action_values\n",
    "\n",
    "\n",
    "def q_learning(epsilon, gamma, alpha, episodes, track):\n",
    "    \"\"\"Defines the Q-learning function which performs 𝜀-greedy control on the given track.\n",
    "\n",
    "    Args:\n",
    "        epsilon: Exploration probability\n",
    "        gamma: Discount factor\n",
    "        alpha: Forgetting factor\n",
    "        episodes: Number of evaluated episodes\n",
    "        track: Race track to learn\n",
    "\n",
    "    Returns:\n",
    "        cumulated_rewards: List of accumulated rewards per episode\n",
    "        action_values: The learned action values\n",
    "    \"\"\"\n",
    "    # define track\n",
    "    course = track.course\n",
    "    x_size, y_size = len(course[0]), len(course)\n",
    "    \n",
    "    # Initialise action values \n",
    "    action_values = np.zeros([track.bounds[0], track.bounds[1], 1+2*track.MAX_VELOCITY, 1+2*track.MAX_VELOCITY, 3, 3])\n",
    "    \n",
    "    cumulated_rewards = []\n",
    "\n",
    "    for e in tqdm(range(episodes), desc='episode'): \n",
    "        cumulated_reward = 0\n",
    "\n",
    "        pos_map = np.zeros((y_size, x_size))\n",
    "        state = track.reset()   \n",
    "\n",
    "        # episodes do not terminate by time limit\n",
    "        while True:\n",
    "            pos_map[state[0], state[1]] += 1  # exploration map\n",
    "\n",
    "            # YOUR CODE HERE\n",
    "            raise NotImplementedError()\n",
    "        \n",
    "        cumulated_rewards.append(cumulated_reward) \n",
    "        \n",
    "    return cumulated_rewards, action_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# configuration parameters\n",
    "epsilon = 0.1   # exploration probability\n",
    "gamma = 1       # discount factor\n",
    "alpha = 0.5      # forgetting factor\n",
    "episodes = 500 # number of evaluated episodes\n",
    "\n",
    "# define track using the course defined above\n",
    "course = course\n",
    "track = RaceTrackEnv(course)\n",
    "x_size, y_size = len(course[0]), len(course)\n",
    "\n",
    "cumulated_rewards, action_values = q_learning(epsilon, gamma, alpha, episodes, track)\n",
    "\n",
    "plt.plot(cumulated_rewards)\n",
    "plt.title('Training Progress using alpha = {}:'.format(alpha))\n",
    "plt.xlabel('Episode')\n",
    "plt.ylabel('Cumulated Reward')\n",
    "plt.ylim(-2000, 0) \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-2f30a4a48607f604",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "Show again the best 5 episodes using the function defined in 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-8f635c16703f4776",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "evaluate_policy(action_values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-54ac52d0a2b9c871",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "## 4) Double Q-Learning\n",
    "\n",
    "Now assume the driver had a few beers. Due to the alcohol he/she is not able to perceive the environment in detail.\n",
    "The reward is therefore stochastic whenever an action, that accelerates the car into the direction of a corner, is selected (e.g. [-1,1] in the upper right quadrant).\n",
    "A corresponding stochastic environment is given below. \n",
    "Execute the cell before continuing with the exercise.\n",
    "\n",
    "- Use this environment and try to solve it with the Q-learning function written in 3) using $\\varepsilon = 0.1$, $\\alpha = 0.5$ and $\\gamma = 1$ with the cells given below.\n",
    "\n",
    "A code template is given which repeats the training $3$ times to calculate the mean cumulated reward.\n",
    "\n",
    "\n",
    "- Write a double Q-learning algorithm to solve the above defined course with the given template.\n",
    "Use again $\\varepsilon = 0.1$, $\\alpha = 0.5$ and $\\gamma = 1$.\n",
    "\n",
    "Compare the results."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-b1dc063f23f3e960",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "### Drunken Driver Environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-e57ef2fd78635217",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "from racetrack_environment import StochRaceTrackEnv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-1ea1d57743964d70",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "# configuration parameters\n",
    "epsilon = 0.1    # exploration probability\n",
    "gamma = 1        # discount factor\n",
    "alpha = 0.5      # forgetting factor\n",
    "episodes = 1000  # number of evaluated episodes\n",
    "repetitions = 4  # repeats the training to get mean reward\n",
    "\n",
    "# define track using the course defined above\n",
    "track = StochRaceTrackEnv(course=course)\n",
    "\n",
    "reward_trajectories_Q = []\n",
    "policies_Q = []\n",
    "\n",
    "# Runs training repetitions-times\n",
    "for _ in tqdm(range(repetitions), desc='Experiment'):\n",
    "    \n",
    "    # Execute q-learning\n",
    "    cumulated_rewards, action_values = q_learning(epsilon, gamma, alpha, episodes, track)\n",
    "    \n",
    "    # store reward and policy\n",
    "    reward_trajectories_Q.append(cumulated_rewards)\n",
    "    policies_Q.append(action_values)\n",
    "    \n",
    "plt.plot(np.vstack(reward_trajectories_Q).mean(axis=0))\n",
    "plt.title('Q-learning')\n",
    "plt.xlabel('Episode')\n",
    "plt.ylabel('Mean Cumulated Reward')\n",
    "plt.ylim(-400, 0) \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-5796699faabb48d5",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "# show episodes\n",
    "# Uses last calulated Q-values to evaluate the policy\n",
    "evaluate_policy(policies_Q[-1], 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": true,
     "grade_id": "cell-caf321513ac011ca",
     "locked": false,
     "points": 0,
     "schema_version": 3,
     "solution": true,
     "task": false
    }
   },
   "source": [
    "## 4) Solution\n",
    "\n",
    "Double Q-Learning is especially useful if the environment itself is random e.g. in the form of stochastic rewards. But it uses more memory without special upsides in deterministic scenarios."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def learn(action_values1, action_values2, state, action, next_state, reward, gamma, alpha):\n",
    "    \"\"\"Learn from the collected data with a double q-learning update step.\n",
    "\n",
    "    Args:\n",
    "        action_values1: Set 1 of action value estimates before the update\n",
    "        action_values2: Set 2 of action value estimates before the update\n",
    "        state: The state before the interaction\n",
    "        action: The chosen action\n",
    "        next_state: The next state after the interaction\n",
    "        reward: The reward for the interaction\n",
    "        gamma: Discount factor\n",
    "        alpha: Step size\n",
    "\n",
    "    Returns:\n",
    "        action_values1: Set 1 of updated action value estimates\n",
    "        action_values2: Set 2 of updated action value estimates\n",
    "    \"\"\"\n",
    "    # YOUR CODE HERE\n",
    "    raise NotImplementedError()\n",
    "\n",
    "    return action_values1, action_values2\n",
    "\n",
    "\n",
    "def double_q_learning(epsilon, gamma, alpha, episodes, track):\n",
    "    \"\"\"\n",
    "    Defines the double Q-learning function which performs 𝜀-greedy control on the given track.\n",
    "\n",
    "    Args:\n",
    "        epsilon: Exploration probability\n",
    "        gamma: Discount factor\n",
    "        alpha: Forgetting factor\n",
    "        episodes: Number of evaluated episodes\n",
    "        track: Race track to learn\n",
    "    \n",
    "    Returns:\n",
    "        cumulated_rewards: The accumulated rewards per episode \n",
    "        action_values: The action_value estimate\n",
    "    \"\"\"\n",
    "    \n",
    "    # define track\n",
    "    course = track.course\n",
    "    x_size, y_size = len(course[0]), len(course)\n",
    "    \n",
    "    # Initialise action values \n",
    "    action_values1 = np.zeros([track.bounds[0], track.bounds[1], 1+2*track.MAX_VELOCITY, 1+2*track.MAX_VELOCITY, 3, 3])\n",
    "    action_values2 = np.zeros([track.bounds[0], track.bounds[1], 1+2*track.MAX_VELOCITY, 1+2*track.MAX_VELOCITY, 3, 3])\n",
    "    \n",
    "    cumulated_rewards = []\n",
    "    \n",
    "    \n",
    "    for e in tqdm(range(episodes), desc='episode'): \n",
    "        cumulated_reward = 0\n",
    "\n",
    "        pos_map = np.zeros((y_size, x_size))\n",
    "        state = track.reset()     \n",
    "        \n",
    "        # episodes do not terminate by time limit\n",
    "        while True:\n",
    "            pos_map[state[0], state[1]] += 1  # exploration map\n",
    "\n",
    "            # YOUR CODE HERE\n",
    "            raise NotImplementedError()\n",
    "        \n",
    "        cumulated_rewards.append(cumulated_reward)      \n",
    "        \n",
    "    \n",
    "    return cumulated_rewards, action_values1+action_values2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-b951254ccc0a37e1",
     "locked": false,
     "schema_version": 3,
     "solution": true,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "reward_trajectories_double_Q = []\n",
    "policies_double_Q = []\n",
    "\n",
    "# Runs training repetitions-times\n",
    "for _ in tqdm(range(repetitions), desc='Experiment'):\n",
    "    \n",
    "    # Execute double q-learning\n",
    "    cumulated_rewards, action_values = double_q_learning(epsilon, gamma, alpha, episodes, track)\n",
    "    \n",
    "    # store reward and policy\n",
    "    reward_trajectories_double_Q.append(cumulated_rewards)\n",
    "    policies_double_Q.append(action_values)\n",
    "    \n",
    "plt.plot(np.vstack(reward_trajectories_Q).mean(axis=0), label='Q Learning')\n",
    "plt.plot(np.vstack(reward_trajectories_double_Q).mean(axis=0), label='Double Q')\n",
    "plt.title('Q Learning vs Double Q Learning')\n",
    "plt.xlabel('Episode')\n",
    "plt.ylabel('Mean Cumulated Reward')\n",
    "plt.ylim(-400, 0) \n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-72de103bca16326f",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "source": [
    "Show again the best $5$ episodes using the function defined in 2)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-88f7c2dff8695f42",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "# Uses last calulated Q-values to evaluate the policy\n",
    "evaluate_policy(policies_double_Q[-1], 10)"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Create Assignment",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
